ANOVA for Feature Selection

# !pip install dataidea==0.2.5
import scipy as sp
from dataidea.datasets import loadDataset
from scipy import stats
from sklearn.feature_selection import SelectKBest
fpl = loadDataset('fpl')  # the 'fpl' dataset that ships with dataidea
fpl.head(5)  # preview the first five rows
First_Name Second_Name Club Goals_Scored Assists Total_Points Minutes Saves Goals_Conceded Creativity Influence Threat Bonus BPS ICT_Index Clean_Sheets Red_Cards Yellow_Cards Position
0 Bruno Fernandes MUN 18 14 244 3101 0 36 1414.9 1292.6 1253 36 870 396.2 13 0 6 MID
1 Harry Kane TOT 23 14 242 3083 0 39 659.1 1318.2 1585 40 880 355.9 12 0 1 FWD
2 Mohamed Salah LIV 22 6 231 3077 0 41 825.7 1056.0 1980 21 657 385.8 11 0 0 MID
3 Heung-Min Son TOT 17 11 228 3119 0 36 1049.9 1052.2 1046 26 777 315.2 13 0 0 MID
4 Patrick Bamford LEE 17 11 194 3052 0 50 371.0 867.2 1512 26 631 274.6 10 0 3 FWD
# Split Goals_Scored into one Series per player position.
# A single .loc lookup (row mask, column label) replaces chained indexing.

forwards_goals = fpl.loc[fpl['Position'] == 'FWD', 'Goals_Scored']
midfielders_goals = fpl.loc[fpl['Position'] == 'MID', 'Goals_Scored']
defenders_goals = fpl.loc[fpl['Position'] == 'DEF', 'Goals_Scored']
goalkeepers_goals = fpl.loc[fpl['Position'] == 'GK', 'Goals_Scored']
# Perform a one-way ANOVA on goals scored across the four position groups.
# The test is called through the explicitly imported `stats` submodule
# (`from scipy import stats`): a bare `import scipy` is not guaranteed to
# expose `scipy.stats` as an attribute on every SciPy release, so
# `sp.stats` only works when some other import has already loaded it.

f_statistic, p_value = stats.f_oneway(
    forwards_goals,
    midfielders_goals,
    defenders_goals,
    goalkeepers_goals,
)
print("F-statistic:", f_statistic)
print("p-value:", p_value)
F-statistic: 33.281034594400445
p-value: 3.9257634156019246e-20
# Split Assists into one Series per player position.
# A single .loc lookup (row mask, column label) replaces chained indexing.

forwards_assists = fpl.loc[fpl['Position'] == 'FWD', 'Assists']
midfielders_assists = fpl.loc[fpl['Position'] == 'MID', 'Assists']
defenders_assists = fpl.loc[fpl['Position'] == 'DEF', 'Assists']
goalkeepers_assists = fpl.loc[fpl['Position'] == 'GK', 'Assists']
# Perform a one-way ANOVA on assists across the four position groups.
# The test is called through the explicitly imported `stats` submodule
# (`from scipy import stats`): a bare `import scipy` is not guaranteed to
# expose `scipy.stats` as an attribute on every SciPy release, so
# `sp.stats` only works when some other import has already loaded it.

f_statistic, p_value = stats.f_oneway(
    forwards_assists,
    midfielders_assists,
    defenders_assists,
    goalkeepers_assists,
)
print("F-statistic:", f_statistic)
print("p-value:", p_value)
F-statistic: 19.263717036430815
p-value: 5.124889288362087e-12
# Rank the two candidate features with scikit-learn's SelectKBest.
# No score_func is passed, so the selector's default (f_classif) is used.

feature_columns = ['Goals_Scored', 'Assists']

test = SelectKBest(k=1)  # keep only the single highest-scoring feature
fit = test.fit(fpl[feature_columns], fpl.Position)  # fit() returns the selector itself
scores = fit.scores_
features = fit.transform(fpl[feature_columns])
selected_indices = fit.get_support(indices=True)

print('Feature Scores: ', scores)
print('Selected Features Indices: ', selected_indices)
Feature Scores:  [33.28103459 19.26371704]
Selected Features Indices:  [0]
Back to top